#!/bin/bash

# ---------------------------------------------------------------------------
# Configuration: replace the placeholder paths below before running.
# ---------------------------------------------------------------------------

# Complete path to the Read1 FASTQ file (including the file name).
READ1='/path-to-Read1.fastq.gz'

# Complete path to the Read2 FASTQ file (including the file name).
# NOTE(review): not referenced by any step visible in this script.
READ2='/path-to-Read2.fastq.gz'

# Complete path to the output folder for this run.
OUTPUTDIR='/path-to-output-directory/'

# Complete path to the OUTPUTDIR of the Association script.
# NOTE(review): not referenced by any step visible in this script; the
# per-library reference paths are read from KEY_FILE instead.
REF='/Users/ajtulloch/MPRA/output/Assoc/'

# Complete path to the chrom sizes file for the genome.
CHROM='/path-to-chrom-sizes/mm10.chrom.sizes'

# Complete path to the barcode whitelist handed to umi_tools.
WHITELIST='/../Reference/output_generated_whitelist.txt'

# Complete path to Library_Key.txt. Three columns:
#   Gene-of-Interest : Library BC (Read1 orientation) :
#   Path to OUTPUTDIR of Association Script
KEY_FILE='/Users/ajtulloch/MPRA/Reference/Library_Key.txt'

# File that the concatenation script is asked to write.
CONCATENATION_FILE="${OUTPUTDIR}/Temp/BAC_Concat.txt"

# Complete path to the Reference_Concatenation script.
CONCATENATION_SCRIPT='/path-to-python-script/Reference_Concatenation.py'

# Derived locations: final per-library results, and the per-library split files.
FINAL_DIR="${OUTPUTDIR}/Final"
SPLIT_DIR="${OUTPUTDIR}/Temp/Individual_Libraries"

# Create the output tree.
# -p makes this step safe to re-run (an existing directory is not an error)
# and creates missing parents; quoting keeps paths with spaces intact.
mkdir -p "$OUTPUTDIR" "$OUTPUTDIR/Temp" "$FINAL_DIR" "$SPLIT_DIR" || {
    echo "Error: could not create the output directories under $OUTPUTDIR." >&2
    exit 1
}


# Switch to the output directory. Stop if that fails so that no later step
# runs from an unexpected working directory.
cd "$OUTPUTDIR" || {
    echo "Error: cannot change directory to $OUTPUTDIR." >&2
    exit 1
}

# Prepend the BBMap directory (bbduk) to PATH
export PATH="/Users/ajtulloch/MPRA/bbmap:$PATH"				#complete path to BBMAP


# Extract barcodes from Read1 (the command reads $READ1 and writes
# BC_Extract_R1.fastq.gz). In the umi_tools pattern, N positions are UMI bases
# and C positions are cell-barcode bases; cell barcodes are checked against
# WHITELIST and error-corrected (--error-correct-cell).
umi_tools extract -p NNNNNNNNNNNNNNNNNNNNNNNNCCCCCCCC -I "$READ1" \
    -S "${OUTPUTDIR}/BC_Extract_R1.fastq.gz" \
    --whitelist "$WHITELIST" --error-correct-cell || {
    echo "Error: umi_tools extract failed for $READ1." >&2
    exit 1
}

# Reformat the extracted barcodes into a sorted list of join keys, one per read.
# umi_tools appends "_<cell>_<umi>" to the read ID, so the key is built from
# the LAST two "_"-separated tokens of the ID; this stays correct when the
# original read ID itself contains underscores. Headers with fewer than three
# tokens carry no barcode pair and are skipped.
# split()'s return value is used instead of length(array), which is not
# available in every awk.
# The subshell enables pipefail so a failure in gunzip or awk is not hidden
# by a successful sort, without changing shell options for the rest of the
# script.
(
    set -o pipefail
    gunzip -c "${OUTPUTDIR}/BC_Extract_R1.fastq.gz" \
        | awk 'NR % 4 == 1 {
            n = split($1, parts, "_")
            if (n >= 3) {
                print parts[n - 1] "_" parts[n]
            }
        }' \
        | sort -k1,1 > "${OUTPUTDIR}/Temp/Extracted_Barcodes.tsv"
) || {
    echo "Error: could not build ${OUTPUTDIR}/Temp/Extracted_Barcodes.tsv." >&2
    exit 1
}

# Run the concatenation script on the key file to produce the combined
# file for the BAC libraries, and stop the pipeline if the script reports
# a failure (non-zero exit status).
if ! python3 "$CONCATENATION_SCRIPT" "$KEY_FILE" "$CONCATENATION_FILE"; then
    echo "Error occurred during concatenation. Check the logs above."
    exit 1
fi

# Reaching this point means the script exited with status 0.
echo "Concatenation completed successfully. Output saved to $CONCATENATION_FILE."

# Sort the concatenated fragment/barcode file on column 6, the join key.
# -t $'\t' makes sort split fields on tabs exactly as join does below; with
# sort's default blank-based splitting, the key sort orders by can differ from
# the key join compares, and join then silently drops matches.
sort -t $'\t' -k6,6 "$CONCATENATION_FILE" \
    > "${OUTPUTDIR}/Temp/BAC_Concat_Sorted.txt" || {
    echo "Error: could not sort $CONCATENATION_FILE." >&2
    exit 1
}

# Merge the extracted barcodes with the key: column 6 of the sorted
# concatenated file is matched against column 1 of the barcode list.
# Each output line is the join key followed by the remaining columns of the
# concatenated file (its columns 1-5), repeated once per matching read.
join -t $'\t' -1 6 -2 1 \
    "${OUTPUTDIR}/Temp/BAC_Concat_Sorted.txt" \
    "${OUTPUTDIR}/Temp/Extracted_Barcodes.tsv" \
    > "${OUTPUTDIR}/Temp/Merged.txt" || {
    echo "Error: join of barcodes against the concatenated key failed." >&2
    exit 1
}

# Split the merged table into one file per library inside SPLIT_DIR
# (${OUTPUTDIR}/Temp/Individual_Libraries).
# Column 1 of Merged.txt is the join key; the text before its first "_" names
# the library and therefore the output file (<library>.txt). Columns 2-5 are
# written tab-separated.
# NOTE(review): columns 2-5 are presumably BED-style fields, since these files
# are later fed to `bedtools bedtobam` - confirm against the concatenation
# script.
# The destination directory is passed to awk explicitly rather than relying on
# a preceding `cd`: if that `cd` failed, the per-library files would be
# written into whatever directory happened to be current.
awk -F '\t' -v outdir="$SPLIT_DIR" 'BEGIN {OFS="\t"} {
    split($1, parts, "_");
    print $2, $3, $4, $5 > (outdir "/" parts[1] ".txt");
}' "${OUTPUTDIR}/Temp/Merged.txt" || {
    echo "Error: could not split ${OUTPUTDIR}/Temp/Merged.txt into $SPLIT_DIR." >&2
    exit 1
}

# Return to the home directory, so the working directory after this step is
# unchanged.
cd

#######################################
# Convert one per-library fragment file into a sorted, indexed BAM and
# normalize it against that library's reference BAM with bamCompare.
# Globals:
#   KEY_FILE, CHROM, FINAL_DIR, SPLIT_DIR (read)
# Arguments:
#   $1 - path to a <library>.txt file from the split directory
# Outputs:
#   Progress messages on stdout, error messages on stderr.
#   Writes <library>_Experiment_Expand.bam (plus its index) into SPLIT_DIR and
#   <library>-log2ratio.bw / <library>-ratio.bw into FINAL_DIR/<library>.
# Returns:
#   0 on success, 1 as soon as any step fails.
#######################################
process_library() {
    local file=$1
    local base_name ref_bam_path library_output_dir
    local experiment_bam experiment_bw_log2 experiment_bw_ratio

    # Extract the base name (e.g., ABC from ABC.txt)
    base_name=$(basename "$file" .txt)
    echo "Processing file: $file (library: $base_name)"

    # Look up the library in column 2 of the key file; column 3 holds the
    # directory containing Final/Count_Expand.bam. `exit` stops at the first
    # match so duplicate key rows cannot yield a multi-line path.
    ref_bam_path=$(awk -F '\t' -v key="$base_name" \
        '$2 == key {print $3 "/Final/Count_Expand.bam"; exit}' "$KEY_FILE")

    if [[ -z "$ref_bam_path" ]]; then
        echo "Error: REF.bam file for $base_name not found in Key_File." >&2
        return 1
    fi
    if [[ ! -f "$ref_bam_path" ]]; then
        echo "Error: REF.bam file for $base_name does not exist: $ref_bam_path" >&2
        return 1
    fi

    # Create a subfolder for this library within Final
    library_output_dir="${FINAL_DIR}/${base_name}"
    if ! mkdir -p "$library_output_dir"; then
        echo "Error: could not create $library_output_dir." >&2
        return 1
    fi

    # Output paths for this library
    experiment_bam="${SPLIT_DIR}/${base_name}_Experiment_Expand.bam"
    experiment_bw_log2="${library_output_dir}/${base_name}-log2ratio.bw"
    experiment_bw_ratio="${library_output_dir}/${base_name}-ratio.bw"

    # Convert the fragment file into a coordinate-sorted BAM. bedtools reads
    # the file directly (no `cat`); pipefail is enabled only inside this
    # subshell so a bedtools failure is not masked by samtools succeeding.
    if ! (
        set -o pipefail
        bedtools bedtobam -g "$CHROM" -i "$file" \
            | samtools sort -o "$experiment_bam"
    ); then
        echo "Error: could not build $experiment_bam." >&2
        return 1
    fi

    # Index the BAM file
    if ! samtools index "$experiment_bam"; then
        echo "Error: could not index $experiment_bam." >&2
        return 1
    fi

    # Normalize observed reads to the library at 1 bp bins using 8 threads:
    # first with bamCompare's default operation (written to the log2ratio
    # file), then with the plain ratio.
    if ! bamCompare -p 8 -bs 1 -b1 "$experiment_bam" -b2 "$ref_bam_path" \
        -o "$experiment_bw_log2"; then
        echo "Error: bamCompare (default operation) failed for $base_name." >&2
        return 1
    fi

    if ! bamCompare -p 8 -bs 1 --operation ratio -b1 "$experiment_bam" \
        -b2 "$ref_bam_path" -o "$experiment_bw_ratio"; then
        echo "Error: bamCompare (ratio) failed for $base_name." >&2
        return 1
    fi

    echo "Finished processing library: $base_name"
}

# Loop over all .txt files in the split directory, counting libraries seen
# and libraries that failed so the final message reflects what happened.
processed=0
failed=0
for file in "${SPLIT_DIR}"/*.txt; do
    # When nothing matches, the glob stays literal; skip that non-file.
    [[ -e "$file" ]] || continue
    processed=$((processed + 1))
    if ! process_library "$file"; then
        failed=$((failed + 1))
    fi
done

if ((processed == 0)); then
    echo "Error: no library files (*.txt) found in $SPLIT_DIR." >&2
    exit 1
fi

if ((failed > 0)); then
    echo "Error: $failed of $processed libraries failed; see messages above." >&2
    exit 1
fi

echo "All libraries processed successfully."


